This is just a short note on generating random data to be used for a pair plot and a correlation map. For the plot style we use the HEP style.
Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
import seaborn as sns
import random
from scipy.stats import multivariate_normal as mvn
import mplhep as hep
hep.style.use("CMS" )
import warnings
warnings.filterwarnings('ignore' )
# Seed both the stdlib and the NumPy global generators with the same value
# so that every run of the note produces the same random tables.
seed = 1234
random.seed(seed)
np.random.seed(seed)
# Figure defaults applied on top of the plotting style selected above:
# face colour "0.8" for saved figures and a base font size of 13.
plt.rcParams.update({'savefig.facecolor': "0.8", 'font.size': 13})
# create first set
# Sample size shared by every feature, and the upper bound of the skewed one.
numValues = 1000
maxValue = 100
skewness = -5  # Negative values are left skewed, positive values are right skewed.

# Numbers which follow a normal distribution.
feature_a = np.random.normal(500, 25, size=numValues)
feature_b = np.random.normal(1023, 19, size=numValues)
# Uniform draws on [0, 60) truncated to integers, and integer draws from
# np.arange(40).
feature_d = np.random.uniform(low=0, high=60, size=(numValues,)).astype(int)
feature_e = np.random.choice(a=40, size=numValues)

# Numbers which follow a skewed normal distribution, rescaled to [0, maxValue].
# NOTE: the intermediate array must not be called `random`, otherwise it
# shadows the stdlib `random` module for the rest of the file.
skew_sample = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)  # Skewnorm function
skew_sample = skew_sample - skew_sample.min()  # Shift the set so the minimum value is equal to zero.
skew_sample = skew_sample / skew_sample.max()  # Standardize all the values between 0 and 1.
feature_c = skew_sample * maxValue  # Multiply the standardized values by the maximum value.

# Numbers which follow a power-law distribution with exponent 1.75 - 1.
feature_f = np.random.power(1.75, numValues)
Code
# prepare to create dataframe
# Turn every feature array into a plain Python list, keeping the A..F order.
list_1, list_2, list_3, list_4, list_5, list_6 = (
    feature.tolist()
    for feature in (feature_a, feature_b, feature_c,
                    feature_d, feature_e, feature_f)
)
# Map column label -> values; the insertion order fixes the column order.
data_set_1 = dict(zip(
    ('feature A', 'feature B', 'feature C',
     'feature D', 'feature E', 'feature F'),
    (list_1, list_2, list_3, list_4, list_5, list_6),
))
df_set_1 = pd.DataFrame(data_set_1)
# Preview the first 15 rows.
df_set_1.head(15)
Table 1: Random Data based on distribution
0
511.785879
992.938048
76.072909
38
22
0.857588
1
470.225608
992.216386
65.754836
23
36
0.948207
2
535.817674
1023.884676
74.986424
58
20
0.903021
3
492.183703
991.083252
58.718138
26
18
0.837142
4
481.985282
1049.521954
84.559789
48
18
0.653904
5
522.179074
1006.945543
80.584231
58
18
0.713595
6
521.489710
1038.466132
79.887988
35
19
0.706957
7
484.086912
1022.054896
77.791707
38
33
0.533313
8
500.392409
1033.150688
66.112721
36
33
0.787118
9
443.932876
1007.666827
74.728363
38
22
0.320662
10
528.750893
1038.991174
80.805396
49
7
0.650921
11
524.798651
1012.394720
74.509873
3
6
0.695892
12
523.833103
1011.046008
63.021237
16
4
0.322784
13
449.468629
1045.375675
55.890691
15
14
0.748238
14
491.648066
1006.943466
27.852004
59
12
0.624733
Some data plots for Table 1
Code
# 2x2 grid of overlaid scatter plots for Table 1. Every panel uses one feature
# on the x axis and draws two other features against it: the first in red
# (alpha .5) and the second in blue (alpha .2).
fig, ax = plt.subplots(nrows=2, ncols=2)
# (target axes, x column, red y column, blue y column)
panels = [
    (ax[0][0], "feature A", "feature F", "feature B"),
    (ax[0][1], "feature B", "feature F", "feature E"),
    (ax[1][0], "feature C", "feature A", "feature D"),
    (ax[1][1], "feature F", "feature A", "feature C"),
]
for axis, x_col, y_red, y_blue in panels:
    df_set_1.plot(x=x_col, y=y_red, kind="scatter", color="r", alpha=.5, ax=axis)
    df_set_1.plot(x=x_col, y=y_blue, kind="scatter", ax=axis, color="b", alpha=.2)
    # Both series share the y axis, so name both in the label.
    axis.set_ylabel(f"{y_red} and {y_blue}")
# tight_layout only accounts for the tick labels and axis labels that exist
# when it is called, so it has to run after everything has been drawn.
plt.tight_layout(pad=.95)
Code
# create second set
# The covariance matrix is written out by hand rather than estimated from an
# existing data set with np.cov. The diagonal is 1, so the off-diagonal
# entries are the pairwise correlations between the four columns.
cov = np.array([
    [1.0, 0.8, 0.7, 0.6],
    [0.8, 1.0, 0.5, 0.5],
    [0.7, 0.5, 1.0, 0.5],
    [0.6, 0.5, 0.5, 1.0],
])
# Draw 1000 samples from a 4-dimensional normal whose components are all
# centred on -200.
scores_ = mvn.rvs(mean=[-200.0] * 4, cov=cov, size=1000)
# Column order follows the row/column order of `cov`.
df_set_2 = pd.DataFrame(
    data=scores_,
    columns=["feature E", "feature F", "feature H", "feature G"],
)
# Preview the first 15 rows.
df_set_2.head(15)
Table 2: Random Data
0
-199.495066
-199.908783
-199.196850
-199.866409
1
-202.273674
-202.157820
-200.138768
-201.427624
2
-200.831163
-199.377240
-201.399875
-201.394457
3
-200.673144
-200.383514
-201.184697
-199.566227
4
-200.083799
-200.251282
-200.132608
-199.823847
5
-201.451594
-200.635146
-201.021892
-200.708970
6
-199.810856
-200.173395
-198.696014
-199.614389
7
-201.341426
-201.206530
-202.029928
-200.957953
8
-200.043966
-199.170888
-200.119988
-200.141972
9
-198.519039
-199.157659
-198.907071
-200.315265
10
-198.008390
-199.410725
-199.420994
-199.457605
11
-200.675343
-200.700744
-201.040379
-200.743832
12
-201.649402
-200.300023
-201.838069
-200.687691
13
-198.625462
-198.742747
-198.052051
-198.507148
14
-199.316719
-200.655931
-199.272346
-200.457617
Some data plots for Table 2
Code
# Two side-by-side scatter panels for Table 2; in each panel the first series
# is drawn in red (alpha .5) and the second in blue (alpha .2).
fig, ax = plt.subplots(nrows=1, ncols=2)
# Left panel: feature G and feature F against feature E. The legend quotes the
# correlation with feature E taken from the covariance matrix used to generate
# df_set_2: 0.6 for feature G and 0.8 for feature F.
df_set_2.plot(x="feature E", y="feature G", kind="scatter", color="r", alpha=.5, ax=ax[0], label="feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x="feature E", y="feature F", kind="scatter", ax=ax[0], color="b", alpha=.2, label="feature F, $corr_{feature E}$ = .8")
ax[0].set_ylabel("feature G and feature F")
# Right panel: feature H and feature G against feature F.
df_set_2.plot(x="feature F", y="feature H", kind="scatter", color="r", alpha=.5, ax=ax[1])
df_set_2.plot(x="feature F", y="feature G", kind="scatter", ax=ax[1], color="b", alpha=.2)
ax[1].set_ylabel("feature G and feature H")
# tight_layout only accounts for the labels and legends that exist when it is
# called, so it has to run after everything has been drawn.
plt.tight_layout(pad=3)